library('tidyverse')
library('glmnet')
library('caret')
library('pROC')
library('rsample')
Modeling for Default
Modeling default for loan applications with various techniques
Approach
I’m focusing on modeling for DEFAULT
using Penalized Regression.
Prep
Let’s load a few libraries:
We’ll load the cleaned, balanced, training data:
# Location of the cleaned, balanced training data.
# NOTE(review): this is an absolute, machine-specific path; the script will
# only run where this directory exists.
path <- 'D:/All Repos/home-credit-default-risk-group/data/application_train_smote.csv'
# fread() returns a data.table; convert it to a plain data.frame before use.
data <- data.table::fread(path) |>
  as.data.frame()
glimpse(data)
Rows: 555,256
Columns: 181
$ CASH_LOAN.N <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ CASH_LOAN.Y <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ GENDER_MALE.N <dbl> 0, 1, 1, 1, 0, 1, 1,…
$ GENDER_MALE.Y <dbl> 1, 0, 0, 0, 1, 0, 0,…
$ FLAG_OWN_CAR.N <dbl> 0, 0, 1, 1, 0, 1, 1,…
$ FLAG_OWN_CAR.Y <dbl> 1, 1, 0, 0, 1, 0, 0,…
$ FLAG_OWN_REALTY.N <dbl> 1, 0, 0, 0, 1, 1, 0,…
$ FLAG_OWN_REALTY.Y <dbl> 0, 1, 1, 1, 0, 0, 1,…
$ CNT_CHILDREN <dbl> 0, 1, 0, 0, 0, 1, 0,…
$ AMT_INCOME_TOTAL <dbl> 90000, 252000, 31500…
$ AMT_CREDIT <dbl> 263686.5, 675000.0, …
$ AMT_ANNUITY <dbl> 19237.5, 53460.0, 52…
$ AMT_GOODS_PRICE <dbl> 238500, 675000, 9000…
$ NAME_TYPE_SUITE.Children <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_TYPE_SUITE.Family <dbl> 0, 1, 0, 0, 0, 0, 1,…
$ NAME_TYPE_SUITE.Group.of.people <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_TYPE_SUITE.Other_A <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_TYPE_SUITE.Other_B <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_TYPE_SUITE.Spouse..partner <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_TYPE_SUITE.Unaccompanied <dbl> 1, 0, 1, 1, 1, 1, 0,…
$ NAME_INCOME_TYPE.Businessman <int> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_INCOME_TYPE.Commercial.associate <dbl> 0, 0, 1, 1, 0, 0, 0,…
$ NAME_INCOME_TYPE.Maternity.leave <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_INCOME_TYPE.Pensioner <dbl> 0, 0, 0, 0, 0, 0, 1,…
$ NAME_INCOME_TYPE.State.servant <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_INCOME_TYPE.Student <int> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_INCOME_TYPE.Unemployed <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_INCOME_TYPE.Working <dbl> 1, 1, 0, 0, 1, 1, 0,…
$ NAME_EDUCATION_TYPE.Academic.degree <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_EDUCATION_TYPE.Higher.education <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_EDUCATION_TYPE.Incomplete.higher <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_EDUCATION_TYPE.Lower.secondary <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_EDUCATION_TYPE.Secondary...secondary.special <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ NAME_FAMILY_STATUS.Civil.marriage <dbl> 0, 0, 0, 0, 0, 1, 0,…
$ NAME_FAMILY_STATUS.Married <dbl> 1, 1, 1, 0, 1, 0, 0,…
$ NAME_FAMILY_STATUS.Separated <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_FAMILY_STATUS.Single...not.married <dbl> 0, 0, 0, 1, 0, 0, 0,…
$ NAME_FAMILY_STATUS.Widow <dbl> 0, 0, 0, 0, 0, 0, 1,…
$ NAME_HOUSING_TYPE.Co.op.apartment <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_HOUSING_TYPE.House...apartment <dbl> 0, 1, 1, 0, 1, 1, 1,…
$ NAME_HOUSING_TYPE.Municipal.apartment <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_HOUSING_TYPE.Office.apartment <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_HOUSING_TYPE.Rented.apartment <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ NAME_HOUSING_TYPE.With.parents <dbl> 1, 0, 0, 1, 0, 0, 0,…
$ REGION_POPULATION_RELATIVE <dbl> 0.002134, 0.014520, …
$ DAYS_BIRTH <dbl> -10042, -10464, -172…
$ DAYS_EMPLOYED <dbl> -1598, -2245, -195, …
$ DAYS_REGISTRATION <dbl> -167, -169, -11387, …
$ DAYS_ID_PUBLISH <dbl> -2529, -3111, -180, …
$ OWN_CAR_AGE <dbl> 19, 15, 0, 0, 4, 0, …
$ FLAG_EMP_PHONE.N <dbl> 0, 0, 0, 0, 0, 0, 1,…
$ FLAG_EMP_PHONE.Y <dbl> 1, 1, 1, 1, 1, 1, 0,…
$ FLAG_WORK_PHONE.N <dbl> 1, 1, 0, 1, 1, 1, 1,…
$ FLAG_WORK_PHONE.Y <dbl> 0, 0, 1, 0, 0, 0, 0,…
$ FLAG_CONT_MOBILE.N <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ FLAG_CONT_MOBILE.Y <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ FLAG_PHONE.N <dbl> 1, 1, 0, 1, 1, 1, 0,…
$ FLAG_PHONE.Y <dbl> 0, 0, 1, 0, 0, 0, 1,…
$ FLAG_EMAIL.N <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ FLAG_EMAIL.Y <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Accountants <dbl> 0, 0, 1, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Cleaning.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Cooking.staff <dbl> 0, 0, 0, 0, 1, 0, 0,…
$ OCCUPATION_TYPE.Core.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Drivers <dbl> 1, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.High.skill.tech.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.HR.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.IT.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Laborers <dbl> 0, 0, 0, 0, 0, 1, 0,…
$ OCCUPATION_TYPE.Low.skill.Laborers <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Managers <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Medicine.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Private.service.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Realty.agents <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Sales.staff <dbl> 0, 1, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Secretaries <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Security.staff <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ OCCUPATION_TYPE.Waiters.barmen.staff <dbl> 0, 0, 0, 1, 0, 0, 0,…
$ OCCUPATION_TYPE.XNA <dbl> 0, 0, 0, 0, 0, 0, 1,…
$ CNT_FAM_MEMBERS <dbl> 2, 3, 2, 1, 2, 3, 1,…
$ REGION_RATING_CLIENT <dbl> 3, 2, 1, 2, 2, 2, 2,…
$ REGION_RATING_CLIENT_W_CITY <dbl> 3, 2, 1, 2, 2, 2, 2,…
$ WEEKDAY_APPR_PROCESS_START.FRIDAY <dbl> 1, 1, 0, 1, 0, 0, 0,…
$ WEEKDAY_APPR_PROCESS_START.MONDAY <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ WEEKDAY_APPR_PROCESS_START.SATURDAY <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ WEEKDAY_APPR_PROCESS_START.SUNDAY <dbl> 0, 0, 1, 0, 0, 0, 0,…
$ WEEKDAY_APPR_PROCESS_START.THURSDAY <dbl> 0, 0, 0, 0, 0, 1, 1,…
$ WEEKDAY_APPR_PROCESS_START.TUESDAY <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ WEEKDAY_APPR_PROCESS_START.WEDNESDAY <dbl> 0, 0, 0, 0, 1, 0, 0,…
$ HOUR_APPR_PROCESS_START <dbl> 8, 12, 10, 13, 15, 1…
$ REG_REGION_NOT_LIVE_REGION.N <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ REG_REGION_NOT_LIVE_REGION.Y <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ REG_REGION_NOT_WORK_REGION.N <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ REG_REGION_NOT_WORK_REGION.Y <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ LIVE_REGION_NOT_WORK_REGION.N <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ LIVE_REGION_NOT_WORK_REGION.Y <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ REG_CITY_NOT_LIVE_CITY.N <dbl> 1, 1, 1, 0, 0, 1, 1,…
$ REG_CITY_NOT_LIVE_CITY.Y <dbl> 0, 0, 0, 1, 1, 0, 0,…
$ REG_CITY_NOT_WORK_CITY.N <dbl> 1, 1, 1, 0, 0, 1, 1,…
$ REG_CITY_NOT_WORK_CITY.Y <dbl> 0, 0, 0, 1, 1, 0, 0,…
$ LIVE_CITY_NOT_WORK_CITY.N <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ LIVE_CITY_NOT_WORK_CITY.Y <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Advertising <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Agriculture <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Bank <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Business.Entity.Type.1 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Business.Entity.Type.2 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Business.Entity.Type.3 <dbl> 1, 0, 1, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Cleaning <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Construction <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Culture <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Electricity <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Emergency <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Government <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Hotel <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Housing <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.1 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.10 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.11 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.12 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.13 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.2 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.3 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.4 <dbl> 0, 1, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.5 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.6 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.7 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.8 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Industry..type.9 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Insurance <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Kindergarten <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Legal.Services <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Medicine <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Military <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Mobile <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Other <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Police <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Postal <dbl> 0, 0, 0, 0, 0, 1, 0,…
$ ORGANIZATION_TYPE.Realtor <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Religion <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Restaurant <dbl> 0, 0, 0, 1, 1, 0, 0,…
$ ORGANIZATION_TYPE.School <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Security <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Security.Ministries <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Self.employed <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Services <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Telecom <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Trade..type.1 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Trade..type.2 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Trade..type.3 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Trade..type.4 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Trade..type.5 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Trade..type.6 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Trade..type.7 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Transport..type.1 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Transport..type.2 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Transport..type.3 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.Transport..type.4 <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.University <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ ORGANIZATION_TYPE.XNA <dbl> 0, 0, 0, 0, 0, 0, 1,…
$ EXT_SOURCE_1 <dbl> 0.5021294, 0.4415526…
$ EXT_SOURCE_2 <dbl> 0.149016484, 0.48337…
$ EXT_SOURCE_3 <dbl> 0.3996756, 0.5406545…
$ OBS_30_CNT_SOCIAL_CIRCLE <dbl> 2, 0, 0, 1, 0, 1, 0,…
$ DEF_30_CNT_SOCIAL_CIRCLE <dbl> 1, 0, 0, 0, 0, 1, 0,…
$ OBS_60_CNT_SOCIAL_CIRCLE <dbl> 2, 0, 0, 1, 0, 1, 0,…
$ DEF_60_CNT_SOCIAL_CIRCLE <dbl> 1, 0, 0, 0, 0, 1, 0,…
$ DAYS_LAST_PHONE_CHANGE <dbl> -889, -1043, 0, -251…
$ AMT_REQ_CREDIT_BUREAU_HOUR <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ AMT_REQ_CREDIT_BUREAU_DAY <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ AMT_REQ_CREDIT_BUREAU_WEEK <dbl> 0, 0, 1, 0, 0, 0, 0,…
$ AMT_REQ_CREDIT_BUREAU_MON <dbl> 0, 0, 0, 0, 0, 0, 1,…
$ AMT_REQ_CREDIT_BUREAU_QRT <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ AMT_REQ_CREDIT_BUREAU_YEAR <dbl> 2, 1, 1, 1, 4, 1, 1,…
$ IMPUTED_EXT1.N <dbl> 0, 1, 1, 1, 1, 1, 0,…
$ IMPUTED_EXT1.Y <dbl> 1, 0, 0, 0, 0, 0, 1,…
$ IMPUTED_EXT2.N <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ IMPUTED_EXT2.Y <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ IMPUTED_EXT3.N <dbl> 1, 1, 1, 1, 1, 1, 1,…
$ IMPUTED_EXT3.Y <dbl> 0, 0, 0, 0, 0, 0, 0,…
$ DEFAULT <chr> "Y", "Y", "Y", "Y", …
We’ll work with a sample of the data since we have over 500K records.
# Fix the RNG so the split and the samples below are reproducible.
set.seed(814)
# Splits
# NOTE(review): the input file name suggests the data was balanced with SMOTE
# before this split; if so, synthetic rows derived from training rows may end
# up in the test set -- confirm.
split_obj <- initial_split(data)
full_train <- training(split_obj)
full_test <- testing(split_obj)
# 25% sample of the data
train_sampl <- sample_n(
  full_train,
  ceiling(nrow(full_train) * 0.25)
)
test_sampl <- sample_n(
  full_test,
  ceiling(nrow(full_test) * 0.25)
)
All features in this file may conceivably be helpful in predicting DEFAULT
(originally named TARGET), so we shouldn’t need to subset any of them.
Penalized Regression
Prep
We have little to no way of knowing which features will be predictive. Therefore, an elastic net is probably the best choice.
Let’s prepare the data for penalized regression. glmnet
requires split predictors and target, with the latter being in a vector and the former being cast as a matrix.
# glmnet takes the response as a vector and the predictors as a matrix.
train_target <- train_sampl$DEFAULT
train_predictors <- as.matrix(
  train_sampl |> select(-DEFAULT)
)
We’ll also do it for the test sample data.
# Same response/predictor split for the held-out sample.
test_target <- test_sampl$DEFAULT
test_predictors <- as.matrix(
  test_sampl |> select(-DEFAULT)
)
Basic Modeling
Let’s run an elastic net model leveraging cross validation. We’ll use the default 10 folds. The output we get will measure AUC
and the standard error thereof.
# Cross-validated elastic net (alpha = 0.5) logistic regression, scored by AUC.
# nfolds is not set, so cv.glmnet's default number of folds is used.
mod1 <- cv.glmnet(
  x = train_predictors,
  y = train_target,
  family = "binomial",
  alpha = 0.5,
  type.measure = "auc"
)
mod1
Call: cv.glmnet(x = train_predictors, y = train_target, type.measure = "auc", family = "binomial", alpha = 0.5)
Measure: AUC
Lambda Index Measure SE Nonzero
min 0.0002293 78 0.7655 0.001077 166
1se 0.0023471 53 0.7646 0.001041 124
Using an elastic net model, we’re achieving an AUC of ~0.76
with a standard error of ~0.001
and 124 nonzero coefficients at lambda.1se.
# Plot the cross-validation results stored in mod1.
plot(mod1)
Let’s look at the significant coefficients for the 1se
model since it’s VERY close to the absolute minimum in terms of performance:
# Coefficients of mod1 at lambda.1se, as a sparse one-column matrix.
# NOTE(review): the variable `coef` shadows the function name; calls such as
# coef(mod1, ...) still resolve to the function, but a rename would be clearer.
coef <- coef(mod1, "lambda.1se")
# Densify and move the predictor names out of the row names into a column.
coef_df <- as.data.frame(as.matrix(coef))
coef_df$Predictor <- rownames(coef)
colnames(coef_df)[1] <- "Coefficient"
rownames(coef_df) <- NULL
# Flag predictors the penalty did not shrink to zero; largest effects first.
coef_df |>
  select(Predictor, Coefficient) |>
  mutate(Important_Flag = ifelse(Coefficient == 0, 0, 1)) |>
  arrange(desc(abs(Coefficient)))
Predictor Coefficient
1 (Intercept) 3.532973e+00
2 EXT_SOURCE_3 -3.328217e+00
3 EXT_SOURCE_2 -2.440205e+00
4 NAME_INCOME_TYPE.Unemployed 1.700417e+00
5 EXT_SOURCE_1 -1.580913e+00
6 NAME_INCOME_TYPE.Student -1.050433e+00
7 ORGANIZATION_TYPE.Realtor 9.793680e-01
8 NAME_EDUCATION_TYPE.Academic.degree -9.391707e-01
9 ORGANIZATION_TYPE.Industry..type.5 -5.920880e-01
10 ORGANIZATION_TYPE.Transport..type.3 5.838330e-01
11 ORGANIZATION_TYPE.Legal.Services 5.360799e-01
12 OCCUPATION_TYPE.Low.skill.Laborers 4.543564e-01
13 ORGANIZATION_TYPE.Military -4.502721e-01
14 NAME_TYPE_SUITE.Group.of.people 4.337982e-01
15 ORGANIZATION_TYPE.Industry..type.9 -4.162297e-01
16 ORGANIZATION_TYPE.Trade..type.2 -3.798490e-01
17 ORGANIZATION_TYPE.Security.Ministries -3.638699e-01
18 ORGANIZATION_TYPE.Transport..type.1 -3.561223e-01
19 ORGANIZATION_TYPE.Trade..type.6 -3.210172e-01
20 ORGANIZATION_TYPE.Police -3.125541e-01
21 OCCUPATION_TYPE.IT.staff -2.786280e-01
22 ORGANIZATION_TYPE.Industry..type.12 -2.743194e-01
23 ORGANIZATION_TYPE.Bank -2.523596e-01
24 ORGANIZATION_TYPE.Trade..type.4 -2.432408e-01
25 NAME_INCOME_TYPE.Maternity.leave 2.323895e-01
26 ORGANIZATION_TYPE.Construction 2.300053e-01
27 ORGANIZATION_TYPE.Mobile 2.273055e-01
28 IMPUTED_EXT2.N -2.231258e-01
29 ORGANIZATION_TYPE.Industry..type.2 -2.097668e-01
30 FLAG_OWN_CAR.N 2.033758e-01
31 OCCUPATION_TYPE.Core.staff -2.019235e-01
32 FLAG_OWN_CAR.Y -1.934463e-01
33 NAME_EDUCATION_TYPE.Higher.education -1.933103e-01
34 IMPUTED_EXT2.Y 1.926271e-01
35 OCCUPATION_TYPE.Private.service.staff -1.882990e-01
36 ORGANIZATION_TYPE.Emergency -1.873505e-01
37 ORGANIZATION_TYPE.Industry..type.3 1.851995e-01
38 ORGANIZATION_TYPE.Self.employed 1.774471e-01
39 REGION_RATING_CLIENT_W_CITY 1.698267e-01
40 ORGANIZATION_TYPE.Restaurant 1.682775e-01
41 GENDER_MALE.N -1.668034e-01
42 GENDER_MALE.Y 1.632994e-01
43 OCCUPATION_TYPE.Medicine.staff -1.612532e-01
44 ORGANIZATION_TYPE.Cleaning 1.607989e-01
45 NAME_EDUCATION_TYPE.Secondary...secondary.special 1.601284e-01
46 CASH_LOAN.Y 1.562187e-01
47 DEF_30_CNT_SOCIAL_CIRCLE 1.554303e-01
48 NAME_INCOME_TYPE.State.servant -1.490702e-01
49 CASH_LOAN.N -1.486173e-01
50 OCCUPATION_TYPE.Drivers 1.469576e-01
51 NAME_FAMILY_STATUS.Married -1.443223e-01
52 NAME_EDUCATION_TYPE.Lower.secondary 1.429962e-01
53 OCCUPATION_TYPE.Accountants -1.416987e-01
54 AMT_REQ_CREDIT_BUREAU_DAY 1.271783e-01
55 OCCUPATION_TYPE.Security.staff 1.213192e-01
56 ORGANIZATION_TYPE.Business.Entity.Type.1 -1.152365e-01
57 IMPUTED_EXT1.N -1.100268e-01
58 IMPUTED_EXT1.Y 1.073900e-01
59 IMPUTED_EXT3.N -1.057261e-01
60 IMPUTED_EXT3.Y 1.041181e-01
61 ORGANIZATION_TYPE.Trade..type.3 1.031059e-01
62 REG_CITY_NOT_LIVE_CITY.N -9.851101e-02
63 NAME_HOUSING_TYPE.Office.apartment -9.743842e-02
64 REG_CITY_NOT_LIVE_CITY.Y 9.487028e-02
65 AMT_REQ_CREDIT_BUREAU_QRT -9.074582e-02
66 OCCUPATION_TYPE.Managers -8.839256e-02
67 ORGANIZATION_TYPE.Business.Entity.Type.3 7.908698e-02
68 NAME_HOUSING_TYPE.Municipal.apartment 7.476591e-02
69 ORGANIZATION_TYPE.Postal 7.392358e-02
70 REG_REGION_NOT_LIVE_REGION.N 7.147059e-02
71 OCCUPATION_TYPE.High.skill.tech.staff -6.999488e-02
72 ORGANIZATION_TYPE.Transport..type.2 -6.846373e-02
73 ORGANIZATION_TYPE.Security -6.682691e-02
74 ORGANIZATION_TYPE.Industry..type.4 6.675468e-02
75 NAME_FAMILY_STATUS.Widow -6.634150e-02
76 FLAG_WORK_PHONE.N -6.582503e-02
77 OCCUPATION_TYPE.Laborers 6.483334e-02
78 ORGANIZATION_TYPE.Electricity -6.147942e-02
79 WEEKDAY_APPR_PROCESS_START.MONDAY -6.109460e-02
80 WEEKDAY_APPR_PROCESS_START.SUNDAY -6.059389e-02
81 WEEKDAY_APPR_PROCESS_START.SATURDAY -6.033814e-02
82 ORGANIZATION_TYPE.Industry..type.1 5.986108e-02
83 ORGANIZATION_TYPE.Religion -5.701555e-02
84 REG_REGION_NOT_LIVE_REGION.Y -5.628501e-02
85 NAME_HOUSING_TYPE.Rented.apartment 5.555295e-02
86 FLAG_WORK_PHONE.Y 5.524999e-02
87 NAME_INCOME_TYPE.Commercial.associate -5.471518e-02
88 ORGANIZATION_TYPE.School -4.939896e-02
89 CNT_CHILDREN 4.809819e-02
90 AMT_REQ_CREDIT_BUREAU_WEEK -4.634270e-02
91 OCCUPATION_TYPE.Sales.staff 4.478986e-02
92 DEF_60_CNT_SOCIAL_CIRCLE 4.473631e-02
93 ORGANIZATION_TYPE.Housing -4.291177e-02
94 NAME_INCOME_TYPE.Working 4.098379e-02
95 ORGANIZATION_TYPE.Medicine -3.927555e-02
96 FLAG_PHONE.N 3.743318e-02
97 FLAG_PHONE.Y -3.621675e-02
98 NAME_HOUSING_TYPE.House...apartment -3.229075e-02
99 NAME_TYPE_SUITE.Spouse..partner -3.082867e-02
100 NAME_FAMILY_STATUS.Separated 2.995004e-02
101 WEEKDAY_APPR_PROCESS_START.TUESDAY 2.936577e-02
102 ORGANIZATION_TYPE.Industry..type.11 -2.885678e-02
103 REG_CITY_NOT_WORK_CITY.N -2.730917e-02
104 REG_CITY_NOT_WORK_CITY.Y 2.610276e-02
105 NAME_HOUSING_TYPE.Co.op.apartment 2.295270e-02
106 ORGANIZATION_TYPE.Services -2.233410e-02
107 AMT_REQ_CREDIT_BUREAU_MON -2.198528e-02
108 OCCUPATION_TYPE.Cleaning.staff 2.136781e-02
109 NAME_TYPE_SUITE.Children -1.966768e-02
110 OCCUPATION_TYPE.XNA -1.889462e-02
111 NAME_FAMILY_STATUS.Civil.marriage 1.869274e-02
112 FLAG_OWN_REALTY.N -1.296698e-02
113 WEEKDAY_APPR_PROCESS_START.FRIDAY 1.137110e-02
114 FLAG_OWN_REALTY.Y 1.033040e-02
115 OCCUPATION_TYPE.Cooking.staff 9.017712e-03
116 OWN_CAR_AGE 8.102332e-03
117 HOUR_APPR_PROCESS_START -4.670135e-03
118 ORGANIZATION_TYPE.Kindergarten -3.996302e-04
119 DAYS_LAST_PHONE_CHANGE 6.390872e-05
120 DAYS_ID_PUBLISH 3.215159e-05
121 DAYS_REGISTRATION 1.740589e-05
122 DAYS_BIRTH 1.523334e-05
123 AMT_ANNUITY 5.557808e-06
124 AMT_GOODS_PRICE -7.107722e-07
125 AMT_CREDIT 6.435503e-07
126 AMT_INCOME_TOTAL 0.000000e+00
127 NAME_TYPE_SUITE.Family 0.000000e+00
128 NAME_TYPE_SUITE.Other_A 0.000000e+00
129 NAME_TYPE_SUITE.Other_B 0.000000e+00
130 NAME_TYPE_SUITE.Unaccompanied 0.000000e+00
131 NAME_INCOME_TYPE.Businessman 0.000000e+00
132 NAME_INCOME_TYPE.Pensioner 0.000000e+00
133 NAME_EDUCATION_TYPE.Incomplete.higher 0.000000e+00
134 NAME_FAMILY_STATUS.Single...not.married 0.000000e+00
135 NAME_HOUSING_TYPE.With.parents 0.000000e+00
136 REGION_POPULATION_RELATIVE 0.000000e+00
137 DAYS_EMPLOYED 0.000000e+00
138 FLAG_EMP_PHONE.N 0.000000e+00
139 FLAG_EMP_PHONE.Y 0.000000e+00
140 FLAG_CONT_MOBILE.N 0.000000e+00
141 FLAG_CONT_MOBILE.Y 0.000000e+00
142 FLAG_EMAIL.N 0.000000e+00
143 FLAG_EMAIL.Y 0.000000e+00
144 OCCUPATION_TYPE.HR.staff 0.000000e+00
145 OCCUPATION_TYPE.Realty.agents 0.000000e+00
146 OCCUPATION_TYPE.Secretaries 0.000000e+00
147 OCCUPATION_TYPE.Waiters.barmen.staff 0.000000e+00
148 CNT_FAM_MEMBERS 0.000000e+00
149 REGION_RATING_CLIENT 0.000000e+00
150 WEEKDAY_APPR_PROCESS_START.THURSDAY 0.000000e+00
151 WEEKDAY_APPR_PROCESS_START.WEDNESDAY 0.000000e+00
152 REG_REGION_NOT_WORK_REGION.N 0.000000e+00
153 REG_REGION_NOT_WORK_REGION.Y 0.000000e+00
154 LIVE_REGION_NOT_WORK_REGION.N 0.000000e+00
155 LIVE_REGION_NOT_WORK_REGION.Y 0.000000e+00
156 LIVE_CITY_NOT_WORK_CITY.N 0.000000e+00
157 LIVE_CITY_NOT_WORK_CITY.Y 0.000000e+00
158 ORGANIZATION_TYPE.Advertising 0.000000e+00
159 ORGANIZATION_TYPE.Agriculture 0.000000e+00
160 ORGANIZATION_TYPE.Business.Entity.Type.2 0.000000e+00
161 ORGANIZATION_TYPE.Culture 0.000000e+00
162 ORGANIZATION_TYPE.Government 0.000000e+00
163 ORGANIZATION_TYPE.Hotel 0.000000e+00
164 ORGANIZATION_TYPE.Industry..type.10 0.000000e+00
165 ORGANIZATION_TYPE.Industry..type.13 0.000000e+00
166 ORGANIZATION_TYPE.Industry..type.6 0.000000e+00
167 ORGANIZATION_TYPE.Industry..type.7 0.000000e+00
168 ORGANIZATION_TYPE.Industry..type.8 0.000000e+00
169 ORGANIZATION_TYPE.Insurance 0.000000e+00
170 ORGANIZATION_TYPE.Other 0.000000e+00
171 ORGANIZATION_TYPE.Telecom 0.000000e+00
172 ORGANIZATION_TYPE.Trade..type.1 0.000000e+00
173 ORGANIZATION_TYPE.Trade..type.5 0.000000e+00
174 ORGANIZATION_TYPE.Trade..type.7 0.000000e+00
175 ORGANIZATION_TYPE.Transport..type.4 0.000000e+00
176 ORGANIZATION_TYPE.University 0.000000e+00
177 ORGANIZATION_TYPE.XNA 0.000000e+00
178 OBS_30_CNT_SOCIAL_CIRCLE 0.000000e+00
179 OBS_60_CNT_SOCIAL_CIRCLE 0.000000e+00
180 AMT_REQ_CREDIT_BUREAU_HOUR 0.000000e+00
181 AMT_REQ_CREDIT_BUREAU_YEAR 0.000000e+00
Important_Flag
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
20 1
21 1
22 1
23 1
24 1
25 1
26 1
27 1
28 1
29 1
30 1
31 1
32 1
33 1
34 1
35 1
36 1
37 1
38 1
39 1
40 1
41 1
42 1
43 1
44 1
45 1
46 1
47 1
48 1
49 1
50 1
51 1
52 1
53 1
54 1
55 1
56 1
57 1
58 1
59 1
60 1
61 1
62 1
63 1
64 1
65 1
66 1
67 1
68 1
69 1
70 1
71 1
72 1
73 1
74 1
75 1
76 1
77 1
78 1
79 1
80 1
81 1
82 1
83 1
84 1
85 1
86 1
87 1
88 1
89 1
90 1
91 1
92 1
93 1
94 1
95 1
96 1
97 1
98 1
99 1
100 1
101 1
102 1
103 1
104 1
105 1
106 1
107 1
108 1
109 1
110 1
111 1
112 1
113 1
114 1
115 1
116 1
117 1
118 1
119 1
120 1
121 1
122 1
123 1
124 1
125 1
126 0
127 0
128 0
129 0
130 0
131 0
132 0
133 0
134 0
135 0
136 0
137 0
138 0
139 0
140 0
141 0
142 0
143 0
144 0
145 0
146 0
147 0
148 0
149 0
150 0
151 0
152 0
153 0
154 0
155 0
156 0
157 0
158 0
159 0
160 0
161 0
162 0
163 0
164 0
165 0
166 0
167 0
168 0
169 0
170 0
171 0
172 0
173 0
174 0
175 0
176 0
177 0
178 0
179 0
180 0
181 0
The external source variables are highly important and predictive.
Saving model results
Let’s use the model we found to be the best and predict new probabilities of the classification.
# Predicted probabilities on the test sample from mod1 at lambda.1se.
pred_probs <- predict(
  mod1,
  newx = test_predictors,
  s = 'lambda.1se',
  type = 'response'
)
Now, we need to find the ideal threshold for classification of the probabilities:
# ROC curve of the predicted probabilities against the 0/1-encoded target.
roc_obj <- roc(ifelse(test_target == "Y", 1, 0), pred_probs)
# Youden's J = sensitivity + specificity - 1; its maximum picks the cutoff.
youden_index <- roc_obj$sensitivities + roc_obj$specificities - 1
optimal_index <- which.max(youden_index)
# NOTE(review): the cutoff is tuned on the same test sample that is scored
# afterwards, so the reported metrics are likely optimistic -- consider
# selecting it on training or validation data instead.
optimal_threshold <- roc_obj$thresholds[optimal_index]
optimal_threshold
[1] 0.4280494
We can now use this optimal threshold for the confusion matrix:
# Confusion matrix at the chosen cutoff. Both factors list level 1 first, so
# 1 (DEFAULT == "Y") is the first level in predictions and reference alike.
mat1 <- confusionMatrix(
  factor(ifelse(pred_probs > optimal_threshold, 1, 0), levels = c(1, 0)),
  factor(ifelse(test_target == "Y", 1, 0), levels = c(1, 0))
)
mat1
Confusion Matrix and Statistics
Reference
Prediction 1 0
1 13538 6981
0 3526 10659
Accuracy : 0.6972
95% CI : (0.6924, 0.7021)
No Information Rate : 0.5083
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.3963
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.7934
Specificity : 0.6043
Pos Pred Value : 0.6598
Neg Pred Value : 0.7514
Prevalence : 0.4917
Detection Rate : 0.3901
Detection Prevalence : 0.5913
Balanced Accuracy : 0.6988
'Positive' Class : 1
Let’s also grab the AUC value:
# Area under the test-sample ROC curve computed above.
auc1 <- auc(roc_obj)
auc1
Area under the curve: 0.762
Let’s compile all these metrics into a dataframe and save the results:
# Collect the model label, hyperparameters and headline metrics in one named
# vector. c() coerces everything to character because of the string entries.
performance <- c(
  c("model" = "Elastic Net"),
  # `sep` (not `collapse`) is what joins separate arguments; `collapse` only
  # joins the elements of a single vector, so it inserted no commas here.
  c("hyperparameters" = paste(
    paste("Lambda:", mod1$lambda.1se),
    paste("Mix:", 0.5),
    paste("Cutoff:", optimal_threshold),
    sep = ", "
  )),
  mat1$overall[c("Accuracy")],
  mat1$byClass[c("Precision", "Recall")],
  c("AUC" = auc1)
)
performance
model
"Elastic Net"
hyperparameters
"Lambda: 0.002347112488074 Mix: 0.5 Cutoff: 0.428049365448963"
Accuracy
"0.697239511295528"
Precision
"0.659778741654077"
Recall
"0.793366150961088"
AUC
"0.761974812975667"
# Turn the named metrics vector into a one-row data frame and save it.
write.csv(
  data.frame(as.list(performance)),
  'models/penalized-regression/model-results.csv',
  row.names = FALSE
)
Tuned Modeling
The above, basic model assumes a perfect 50-50 elastic net. It could be, however, that a better-tuned model improves performance. We test a few different values of alpha
(the elastic net mix) to see if we can do any better. We will reduce the number of folds to help it run faster:
# Candidate elastic net mixes (alpha) to compare.
mix <- seq(0.2, 0.8, 0.2)
# One result slot per alpha, allocated up front rather than grown in the loop.
results <- vector("list", length(mix))
for (i in seq_along(mix)) {
  m <- mix[i]
  # 3 folds instead of the default to keep the run time down.
  # NOTE(review): no shared foldid is passed, so each alpha is evaluated on
  # its own random folds; small AUC differences may just be fold noise.
  t_mod <- cv.glmnet(
    x = train_predictors,
    y = train_target,
    family = "binomial",
    nfolds = 3,
    alpha = m,
    type.measure = "auc"
  )
  # t_mod$index selects the CV measure at the two chosen lambdas; the column
  # names below assume the order lambda.min, lambda.1se -- TODO confirm.
  results[[i]] <- c(m, t_mod$cvm[t_mod$index])
}
results_df <- as.data.frame(do.call(rbind, results))
names(results_df) <- c("mix", "AUC_lambda.min", "AUC_lambda.1se")
results_df
mix AUC_lambda.min AUC_lambda.1se
1 0.2 0.7652886 0.7650394
2 0.4 0.7654275 0.7638517
3 0.6 0.7652099 0.7649984
4 0.8 0.7650149 0.7636397
We aren’t achieving meaningfully better or worse results with a differently weighted elastic net. The next step is to determine whether some interaction terms could improve things.
Interaction terms
We’re going to create some interaction terms, something to indicate a varying relationship between predictors.
We’ll implement the following interactions, largely informed by data analysis work and what was found above to be important predictors in estimating default:
CNT_CHILDREN and AMT_CREDIT
AMT_ANNUITY and AMT_CREDIT
DAYS_LAST_PHONE_CHANGE and AMT_REQ_CREDIT_BUREAU_DAY
REGION_RATING_CLIENT_W_CITY and DEF_30_CNT_SOCIAL_CIRCLE
AMT_REQ_CREDIT_BUREAU_DAY and AMT_CREDIT
AMT_CREDIT and AMT_GOODS_PRICE
EXT_SOURCE_# and IMPUTED_EXT#
# Append pairwise interaction (product) columns to a loan-application table.
#
# Args:
#   data: A data frame containing the numeric columns listed in
#     `required_cols` below.
#
# Returns:
#   `data` with nine extra `I_*` columns appended after the existing ones,
#   each the element-wise product of the two columns named in its definition.
#
# Raises:
#   An error naming any required column that is missing from `data`.
add_interactions <- function(data) {
  required_cols <- c(
    "CNT_CHILDREN", "AMT_CREDIT", "AMT_ANNUITY", "AMT_GOODS_PRICE",
    "DAYS_LAST_PHONE_CHANGE", "AMT_REQ_CREDIT_BUREAU_DAY",
    "REGION_RATING_CLIENT_W_CITY", "DEF_30_CNT_SOCIAL_CIRCLE",
    "EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3",
    "IMPUTED_EXT1.Y", "IMPUTED_EXT2.Y", "IMPUTED_EXT3.Y"
  )
  missing_cols <- setdiff(required_cols, names(data))
  if (length(missing_cols) > 0) {
    stop(
      "add_interactions(): missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Output column -> the two input columns whose product it holds.
  # The order here is the order in which the columns are appended.
  interactions <- list(
    I_CHILDREN_X_CREDIT = c("CNT_CHILDREN", "AMT_CREDIT"),
    I_ANNUITY_X_CREDIT = c("AMT_ANNUITY", "AMT_CREDIT"),
    I_PHONE_CHANGE_X_CREDIT_BUREAU_DAY =
      c("DAYS_LAST_PHONE_CHANGE", "AMT_REQ_CREDIT_BUREAU_DAY"),
    I_REGION_RATING_X_30_SOCIAL =
      c("REGION_RATING_CLIENT_W_CITY", "DEF_30_CNT_SOCIAL_CIRCLE"),
    I_BUREAU_DAY_X_CREDIT = c("AMT_REQ_CREDIT_BUREAU_DAY", "AMT_CREDIT"),
    I_CREDIT_X_GOODS = c("AMT_CREDIT", "AMT_GOODS_PRICE"),
    I_EXT1_X_IMP1 = c("EXT_SOURCE_1", "IMPUTED_EXT1.Y"),
    I_EXT2_X_IMP2 = c("EXT_SOURCE_2", "IMPUTED_EXT2.Y"),
    I_EXT3_X_IMP3 = c("EXT_SOURCE_3", "IMPUTED_EXT3.Y")
  )

  alt_data <- data
  for (new_col in names(interactions)) {
    pair <- interactions[[new_col]]
    # Always read from the untouched input so earlier additions cannot
    # influence later ones.
    alt_data[[new_col]] <- data[[pair[1]]] * data[[pair[2]]]
  }
  alt_data
}
# Add the interaction columns to both samples.
alt_train_sampl <- add_interactions(train_sampl)
alt_test_sampl <- add_interactions(test_sampl)
# Predictor matrix for glmnet: every column except the DEFAULT target.
# Selecting by name (rather than -which(...)) cannot accidentally drop every
# column when DEFAULT is absent, and drop = FALSE keeps a 2-D result.
alt_train_predictors <- as.matrix(
  alt_train_sampl[, setdiff(names(alt_train_sampl), "DEFAULT"), drop = FALSE]
)
Let’s now use the interaction variables in another cross validated model:
# Same cross-validated elastic net as mod1 (alpha = 0.5, AUC), fitted on the
# predictor matrix that includes the interaction columns.
mod2 <- cv.glmnet(
  x = alt_train_predictors,
  y = train_target,
  family = "binomial",
  alpha = 0.5,
  type.measure = "auc"
)
mod2
Call: cv.glmnet(x = alt_train_predictors, y = train_target, type.measure = "auc", family = "binomial", alpha = 0.5)
Measure: AUC
Lambda Index Measure SE Nonzero
min 0.0001196 85 0.7834 0.001855 176
1se 0.0004398 71 0.7817 0.001859 165
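The model has improved AUC a touch (about 0.78 versus 0.76), but the standard error has increased rather than decreased (about 0.0019 versus 0.0010). It has also increased the complexity of the model: 165 nonzero coefficients at lambda.1se versus 124 before, roughly a third more. Probably not what is wanted.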
Let’s see if any of these interactions show up as important predictors:
# Coefficients of mod2 at lambda.1se, as a sparse one-column matrix.
# NOTE(review): the variable `coef` shadows the function name; calls such as
# coef(mod2, ...) still resolve to the function, but a rename would be clearer.
coef <- coef(mod2, "lambda.1se")
# Densify and move the predictor names out of the row names into a column.
coef_df <- as.data.frame(as.matrix(coef))
coef_df$Predictor <- rownames(coef)
colnames(coef_df)[1] <- "Coefficient"
rownames(coef_df) <- NULL
# Flag predictors the penalty did not shrink to zero; largest effects first.
coef_df |>
  select(Predictor, Coefficient) |>
  mutate(Important_Flag = ifelse(Coefficient == 0, 0, 1)) |>
  arrange(desc(abs(Coefficient)))
Predictor Coefficient
1 I_EXT3_X_IMP3 -1.528889e+01
2 (Intercept) 1.375993e+01
3 I_EXT1_X_IMP1 -1.110790e+01
4 I_EXT2_X_IMP2 -8.576275e+00
5 IMPUTED_EXT3.N -4.591983e+00
6 IMPUTED_EXT3.Y 3.704059e+00
7 IMPUTED_EXT2.N -3.624469e+00
8 IMPUTED_EXT1.N -3.195336e+00
9 EXT_SOURCE_3 -3.055277e+00
10 NAME_INCOME_TYPE.Student -2.792424e+00
11 IMPUTED_EXT1.Y 2.576911e+00
12 NAME_INCOME_TYPE.Unemployed 2.412100e+00
13 EXT_SOURCE_2 -2.399968e+00
14 IMPUTED_EXT2.Y 1.600262e+00
15 NAME_INCOME_TYPE.Maternity.leave 1.445417e+00
16 NAME_EDUCATION_TYPE.Academic.degree -1.223774e+00
17 ORGANIZATION_TYPE.Realtor 1.173096e+00
18 EXT_SOURCE_1 -1.088912e+00
19 NAME_INCOME_TYPE.Businessman -8.234895e-01
20 ORGANIZATION_TYPE.Legal.Services 7.208582e-01
21 ORGANIZATION_TYPE.Industry..type.5 -6.981610e-01
22 ORGANIZATION_TYPE.Transport..type.3 6.345650e-01
23 NAME_TYPE_SUITE.Group.of.people 6.109301e-01
24 ORGANIZATION_TYPE.Trade..type.4 -5.852989e-01
25 ORGANIZATION_TYPE.Transport..type.1 -5.815638e-01
26 ORGANIZATION_TYPE.Military -4.990645e-01
27 OCCUPATION_TYPE.Low.skill.Laborers 4.780323e-01
28 ORGANIZATION_TYPE.Trade..type.6 -4.615526e-01
29 ORGANIZATION_TYPE.Industry..type.9 -4.571582e-01
30 ORGANIZATION_TYPE.Mobile 4.538192e-01
31 ORGANIZATION_TYPE.Security.Ministries -4.303846e-01
32 ORGANIZATION_TYPE.Trade..type.2 -4.232685e-01
33 ORGANIZATION_TYPE.Industry..type.12 -4.097099e-01
34 OCCUPATION_TYPE.IT.staff -3.945385e-01
35 ORGANIZATION_TYPE.Police -3.652062e-01
36 ORGANIZATION_TYPE.Cleaning 3.494770e-01
37 ORGANIZATION_TYPE.Industry..type.2 -3.383467e-01
38 ORGANIZATION_TYPE.Bank -3.046893e-01
39 ORGANIZATION_TYPE.Religion -2.909659e-01
40 ORGANIZATION_TYPE.Emergency -2.804705e-01
41 ORGANIZATION_TYPE.Industry..type.6 -2.741116e-01
42 ORGANIZATION_TYPE.Construction 2.595581e-01
43 REGION_RATING_CLIENT_W_CITY 2.586392e-01
44 REGION_POPULATION_RELATIVE 2.316930e-01
45 ORGANIZATION_TYPE.Industry..type.3 2.251561e-01
46 AMT_REQ_CREDIT_BUREAU_DAY 2.166025e-01
47 FLAG_OWN_CAR.N 2.109498e-01
48 FLAG_OWN_CAR.Y -2.094914e-01
49 OCCUPATION_TYPE.Private.service.staff -2.044144e-01
50 ORGANIZATION_TYPE.Restaurant 1.921155e-01
51 OCCUPATION_TYPE.Security.staff 1.909153e-01
52 OCCUPATION_TYPE.Core.staff -1.902491e-01
53 ORGANIZATION_TYPE.Self.employed 1.834250e-01
54 NAME_EDUCATION_TYPE.Lower.secondary 1.765198e-01
55 NAME_EDUCATION_TYPE.Secondary...secondary.special 1.739882e-01
56 GENDER_MALE.N -1.717073e-01
57 GENDER_MALE.Y 1.707808e-01
58 ORGANIZATION_TYPE.Security -1.686329e-01
59 OCCUPATION_TYPE.Drivers 1.639389e-01
60 OCCUPATION_TYPE.Medicine.staff -1.602759e-01
61 OCCUPATION_TYPE.HR.staff 1.591153e-01
62 DEF_30_CNT_SOCIAL_CIRCLE 1.563523e-01
63 NAME_EDUCATION_TYPE.Higher.education -1.558138e-01
64 NAME_FAMILY_STATUS.Married -1.551475e-01
65 OCCUPATION_TYPE.Accountants -1.549006e-01
66 ORGANIZATION_TYPE.Industry..type.1 1.454340e-01
67 ORGANIZATION_TYPE.Business.Entity.Type.1 -1.422452e-01
68 ORGANIZATION_TYPE.Industry..type.4 1.383667e-01
69 ORGANIZATION_TYPE.Postal 1.372528e-01
70 ORGANIZATION_TYPE.Trade..type.3 1.277188e-01
71 OCCUPATION_TYPE.Secretaries 1.254356e-01
72 ORGANIZATION_TYPE.Electricity -1.227910e-01
73 ORGANIZATION_TYPE.Transport..type.2 -1.144199e-01
74 ORGANIZATION_TYPE.Housing -1.106148e-01
75 ORGANIZATION_TYPE.Trade..type.1 1.082446e-01
76 NAME_INCOME_TYPE.Working 1.052300e-01
77 NAME_HOUSING_TYPE.Office.apartment -1.042943e-01
78 CASH_LOAN.Y 1.010982e-01
79 NAME_HOUSING_TYPE.Rented.apartment 1.008204e-01
80 NAME_HOUSING_TYPE.Municipal.apartment 1.007539e-01
81 AMT_REQ_CREDIT_BUREAU_QRT -9.825317e-02
82 REG_CITY_NOT_LIVE_CITY.N -9.747794e-02
83 REG_CITY_NOT_LIVE_CITY.Y 9.743638e-02
84 ORGANIZATION_TYPE.Culture 9.596025e-02
85 ORGANIZATION_TYPE.School -9.566730e-02
86 CASH_LOAN.N -9.404510e-02
87 REG_REGION_NOT_LIVE_REGION.N 9.296861e-02
88 OCCUPATION_TYPE.Cleaning.staff 9.203428e-02
89 OCCUPATION_TYPE.Managers -8.959269e-02
90 REG_REGION_NOT_LIVE_REGION.Y -8.907418e-02
91 ORGANIZATION_TYPE.Business.Entity.Type.3 8.837468e-02
92 FLAG_WORK_PHONE.N -8.780171e-02
93 ORGANIZATION_TYPE.Services -8.570895e-02
94 NAME_FAMILY_STATUS.Widow -8.472409e-02
95 FLAG_WORK_PHONE.Y 8.419445e-02
96 REGION_RATING_CLIENT -8.166804e-02
97 OCCUPATION_TYPE.High.skill.tech.staff -8.124010e-02
98 NAME_INCOME_TYPE.State.servant -8.000448e-02
99 AMT_REQ_CREDIT_BUREAU_WEEK -7.937564e-02
100 NAME_HOUSING_TYPE.Co.op.apartment 7.728171e-02
101 NAME_TYPE_SUITE.Children -7.409382e-02
102 OCCUPATION_TYPE.Laborers 7.375717e-02
103 WEEKDAY_APPR_PROCESS_START.SATURDAY -7.182034e-02
104 WEEKDAY_APPR_PROCESS_START.SUNDAY -7.055514e-02
105 FLAG_CONT_MOBILE.N 6.510510e-02
106 ORGANIZATION_TYPE.Hotel -6.495363e-02
107 WEEKDAY_APPR_PROCESS_START.MONDAY -6.437031e-02
108 NAME_TYPE_SUITE.Other_A -6.370491e-02
109 OCCUPATION_TYPE.Sales.staff 6.077562e-02
110 ORGANIZATION_TYPE.Medicine -6.022675e-02
111 ORGANIZATION_TYPE.Industry..type.10 6.011791e-02
112 NAME_TYPE_SUITE.Spouse..partner -5.941819e-02
113 CNT_CHILDREN 5.844177e-02
114 FLAG_CONT_MOBILE.Y -5.544720e-02
115 DEF_60_CNT_SOCIAL_CIRCLE 5.452353e-02
116 WEEKDAY_APPR_PROCESS_START.TUESDAY 5.119699e-02
117 NAME_FAMILY_STATUS.Separated 5.064683e-02
118 ORGANIZATION_TYPE.Industry..type.11 -4.896470e-02
119 OCCUPATION_TYPE.Cooking.staff 4.145144e-02
120 FLAG_PHONE.N 4.015386e-02
121 FLAG_PHONE.Y -3.969318e-02
122 AMT_REQ_CREDIT_BUREAU_HOUR -3.439327e-02
123 ORGANIZATION_TYPE.Kindergarten -3.306421e-02
124 OCCUPATION_TYPE.XNA -3.237406e-02
125 WEEKDAY_APPR_PROCESS_START.FRIDAY 3.092718e-02
126 ORGANIZATION_TYPE.Agriculture 3.032133e-02
127 REG_CITY_NOT_WORK_CITY.N -2.901355e-02
128 REG_CITY_NOT_WORK_CITY.Y 2.889684e-02
129 ORGANIZATION_TYPE.Insurance -2.868275e-02
130 NAME_HOUSING_TYPE.House...apartment -2.623315e-02
131 AMT_REQ_CREDIT_BUREAU_MON -2.579244e-02
132 ORGANIZATION_TYPE.Industry..type.8 -2.279904e-02
133 NAME_FAMILY_STATUS.Civil.marriage 2.043056e-02
134 OCCUPATION_TYPE.Waiters.barmen.staff 1.827983e-02
135 FLAG_OWN_REALTY.N -1.539554e-02
136 FLAG_OWN_REALTY.Y 1.520742e-02
137 WEEKDAY_APPR_PROCESS_START.WEDNESDAY 1.514343e-02
138 ORGANIZATION_TYPE.University 1.092346e-02
139 OWN_CAR_AGE 9.360714e-03
140 ORGANIZATION_TYPE.Business.Entity.Type.2 8.027964e-03
141 ORGANIZATION_TYPE.Trade..type.7 6.660998e-03
142 HOUR_APPR_PROCESS_START -4.934847e-03
143 ORGANIZATION_TYPE.Other 4.932404e-03
144 LIVE_REGION_NOT_WORK_REGION.N -4.255084e-03
145 AMT_REQ_CREDIT_BUREAU_YEAR 3.402660e-03
146 FLAG_EMAIL.N 3.045619e-03
147 ORGANIZATION_TYPE.Trade..type.5 -3.025815e-03
148 ORGANIZATION_TYPE.Telecom -2.636656e-03
149 LIVE_REGION_NOT_WORK_REGION.Y 2.237436e-03
150 FLAG_EMAIL.Y -2.097862e-03
151 NAME_TYPE_SUITE.Other_B 9.427455e-04
152 OBS_30_CNT_SOCIAL_CIRCLE 6.799077e-04
153 DAYS_LAST_PHONE_CHANGE 6.467428e-05
154 I_PHONE_CHANGE_X_CREDIT_BUREAU_DAY 3.415919e-05
155 DAYS_ID_PUBLISH 3.362819e-05
156 DAYS_REGISTRATION 1.781073e-05
157 DAYS_BIRTH 1.489616e-05
158 AMT_ANNUITY 1.434204e-05
159 AMT_CREDIT 2.053306e-06
160 AMT_GOODS_PRICE -1.800457e-06
161 DAYS_EMPLOYED 4.912155e-07
162 I_BUREAU_DAY_X_CREDIT 5.910048e-08
163 AMT_INCOME_TOTAL 2.161212e-08
164 I_CHILDREN_X_CREDIT -1.652219e-08
165 I_ANNUITY_X_CREDIT -1.165911e-11
166 I_CREDIT_X_GOODS 1.062839e-14
167 NAME_TYPE_SUITE.Family 0.000000e+00
168 NAME_TYPE_SUITE.Unaccompanied 0.000000e+00
169 NAME_INCOME_TYPE.Commercial.associate 0.000000e+00
170 NAME_INCOME_TYPE.Pensioner 0.000000e+00
171 NAME_EDUCATION_TYPE.Incomplete.higher 0.000000e+00
172 NAME_FAMILY_STATUS.Single...not.married 0.000000e+00
173 NAME_HOUSING_TYPE.With.parents 0.000000e+00
174 FLAG_EMP_PHONE.N 0.000000e+00
175 FLAG_EMP_PHONE.Y 0.000000e+00
176 OCCUPATION_TYPE.Realty.agents 0.000000e+00
177 CNT_FAM_MEMBERS 0.000000e+00
178 WEEKDAY_APPR_PROCESS_START.THURSDAY 0.000000e+00
179 REG_REGION_NOT_WORK_REGION.N 0.000000e+00
180 REG_REGION_NOT_WORK_REGION.Y 0.000000e+00
181 LIVE_CITY_NOT_WORK_CITY.N 0.000000e+00
182 LIVE_CITY_NOT_WORK_CITY.Y 0.000000e+00
183 ORGANIZATION_TYPE.Advertising 0.000000e+00
184 ORGANIZATION_TYPE.Government 0.000000e+00
185 ORGANIZATION_TYPE.Industry..type.13 0.000000e+00
186 ORGANIZATION_TYPE.Industry..type.7 0.000000e+00
187 ORGANIZATION_TYPE.Transport..type.4 0.000000e+00
188 ORGANIZATION_TYPE.XNA 0.000000e+00
189 OBS_60_CNT_SOCIAL_CIRCLE 0.000000e+00
190 I_REGION_RATING_X_30_SOCIAL 0.000000e+00
Important_Flag
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
20 1
21 1
22 1
23 1
24 1
25 1
26 1
27 1
28 1
29 1
30 1
31 1
32 1
33 1
34 1
35 1
36 1
37 1
38 1
39 1
40 1
41 1
42 1
43 1
44 1
45 1
46 1
47 1
48 1
49 1
50 1
51 1
52 1
53 1
54 1
55 1
56 1
57 1
58 1
59 1
60 1
61 1
62 1
63 1
64 1
65 1
66 1
67 1
68 1
69 1
70 1
71 1
72 1
73 1
74 1
75 1
76 1
77 1
78 1
79 1
80 1
81 1
82 1
83 1
84 1
85 1
86 1
87 1
88 1
89 1
90 1
91 1
92 1
93 1
94 1
95 1
96 1
97 1
98 1
99 1
100 1
101 1
102 1
103 1
104 1
105 1
106 1
107 1
108 1
109 1
110 1
111 1
112 1
113 1
114 1
115 1
116 1
117 1
118 1
119 1
120 1
121 1
122 1
123 1
124 1
125 1
126 1
127 1
128 1
129 1
130 1
131 1
132 1
133 1
134 1
135 1
136 1
137 1
138 1
139 1
140 1
141 1
142 1
143 1
144 1
145 1
146 1
147 1
148 1
149 1
150 1
151 1
152 1
153 1
154 1
155 1
156 1
157 1
158 1
159 1
160 1
161 1
162 1
163 1
164 1
165 1
166 1
167 0
168 0
169 0
170 0
171 0
172 0
173 0
174 0
175 0
176 0
177 0
178 0
179 0
180 0
181 0
182 0
183 0
184 0
185 0
186 0
187 0
188 0
189 0
190 0
It looks as if the following were helpful predictors (though AUC was not improved):
- The interactions between the external sources and their imputed flags rank very high; we’d expect to see this, and it’s important that they do
- Others show up throughout but aren’t extremely notable
Conclusion
There are too many unique relationships to explore for an optimal model in the OLS family, even with the penalized regression flavor. We’re likely tapped out with an AUC of ~0.76–0.78.